# Basic packages
library(tidyverse)
library(ggthemes)
# Data Package
library(gapminder)
# Specialty Graph Packages
library(ggridges)
library(cowplot)
library(GGally)
# Colorscale and adjustment packages
library(viridis)
library(scales)
# Interactive packages
library(plotly)
# Forecasting packages
library(fable)
library(feasts)
library(tsibble)
library(tsbox)
library(Quandl)
library(dygraphs)
library(highcharter)
library(lubridate)
library(zoo)
# Qualitative packages
library(wordcloud2)
library(extrafont)
library(tidytext)
library(textdata)
library(sentimentr)
# Shared theme used by every plot in this file: the FiveThirtyEight theme at
# base size 12, with white plot/panel/legend backgrounds, thin light-grey
# horizontal major gridlines and no vertical major gridlines.
theme_new <- theme_fivethirtyeight(
  base_size = 12
  # , base_family = "Open Sans"
) %+replace%
  theme(
    panel.grid.major.y = element_line(colour = "grey80", size = 0.25),
    panel.grid.major.x = element_blank(),
    panel.background = element_rect(fill = "white"),
    plot.background = element_rect(fill = "white"),
    legend.background = element_rect(fill = "white")
  )
This section covers univariate graphs (one variable at a time).
# Bar chart: number of diamonds per cut, each bar filled by its cut.
ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) +
  geom_bar() +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new
# Histogram of price with 80-unit-wide bins.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 80, fill = "#afd7db") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable",
    y = "Count"
  ) +
  theme_new
# Optional add-on: + theme(axis.title = element_text(size = 10, color = "grey40"))
You could make a univariate boxplot… but why? This should probably be a bivariate plot most of the time.
# Univariate boxplot of price (price is mapped to x, so the box is horizontal).
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_boxplot(fill = "#afd7db") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable"
  ) +
  theme_new
# Optional add-on: + theme(axis.title = element_text(size = 10, color = "grey40"))
You can put any graphs together in a grid. Here is just one example. Note: if you are using a different theme, you may have to adjust the margins here.
# Top panel: slim boxplot of price with axis text, gridlines and panel
# background removed so it reads as a marginal summary.
plt1 <- ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_boxplot(
    fill = "#afd7db",
    width = .1,
    size = .3,
    outlier.alpha = .01
  ) +
  labs(title = "Title") +
  theme_new +
  theme(
    axis.text = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    plot.subtitle = element_text(margin = margin(0, 0, 15, 0))
  )

# Bottom panel: histogram of price; only the plot margin of theme_new is
# replaced.
plt2 <- ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(fill = "#afd7db", binwidth = 30) +
  theme_new %+replace% theme(plot.margin = margin(0, 10, 10, 10))

# Stack both panels in a single column (relative heights 1 : 2.5), aligned
# vertically on their left and right axes.
cowplot::plot_grid(
  plt1, plt2,
  ncol = 1,
  rel_heights = c(1, 2.5),
  align = 'v',
  axis = 'lr'
)
Note: some people find it hard to compare areas in this visualization.
# Stacked bar chart: diamonds per cut, each bar split by color.
ggplot(data = diamonds, mapping = aes(x = cut, fill = color)) +
  geom_bar() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable"
  ) +
  theme_new
# Optional add-on: + theme(axis.title = element_text(size = 10, color = "grey40"))
# Grouped (dodged) bar chart: diamonds per cut, one bar per color side by side.
ggplot(data = diamonds, mapping = aes(x = cut, fill = color)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable"
  ) +
  theme_new
# Optional add-on: + theme(axis.title = element_text(size = 10, color = "grey40"))
# Proportional (100%) stacked bars: share of each color within every clarity.
# width = 1 makes neighbouring bars touch.
ggplot(data = diamonds, mapping = aes(x = clarity, fill = color)) +
  geom_bar(width = 1, position = "fill") +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new
This graph is the easiest to read of the multivariate bar graphs. Note: if you want to switch the orientation, switch the order from “. ~ variable” to “variable ~ .”. It may be smart to adjust the axis grids as well.
# Horizontal bar chart of clarity counts, drawn once per color in side-by-side
# facets. The x-axis text is rotated and shrunk so the count labels fit.
ggplot(data = diamonds, mapping = aes(y = clarity, fill = clarity)) +
  geom_bar() +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  facet_grid(. ~ color) +
  theme_new +
  theme(
    axis.text.x = element_text(angle = 90, size = 8)
  )
# Count plot: one point per color/clarity combination, sized by the number of
# diamonds in that combination.
ggplot(data = diamonds, mapping = aes(x = color, y = clarity)) +
  geom_count(color = "#afd7db") +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new
# Heatmap of how many diamonds fall in each cut/clarity combination.
ggplot(
  data = count(diamonds, cut, clarity),
  mapping = aes(x = cut, y = clarity, fill = n)
) +
  geom_tile() +
  scale_fill_distiller(palette = "GnBu") +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    panel.grid.major.y = element_blank()
  )
# Same cut/clarity heatmap, but with the count printed on every tile and the
# legend removed (the labels make it redundant).
ggplot(
  data = count(diamonds, cut, clarity),
  mapping = aes(x = cut, y = clarity, fill = n)
) +
  geom_tile() +
  geom_text(mapping = aes(label = n), color = "black") +
  scale_fill_distiller(palette = "GnBu") +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_blank()
  )
One categorical variable needs to be binary for this plot. Since I do not have one in the diamonds dataset, I am artificially creating one.
# Diverging bar chart. Data prep: keep only the "Fair" and "Good" cuts, count
# diamonds per clarity and cut, and negate the "Fair" counts so the two cuts
# extend in opposite directions from zero.
diamonds %>%
  filter(cut %in% c("Fair", "Good")) %>%
  count(clarity, cut) %>%
  mutate(positive_negative = n * ifelse(cut == "Fair", -1, 1)) %>%
  # Plot: one horizontal bar per clarity and cut.
  ggplot(mapping = aes(x = positive_negative, y = clarity, fill = cut)) +
  geom_col() +
  scale_fill_manual(values = c("#afd7db", "#fce3bd")) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new
I am not a big fan of the freqpoly for univariate analysis (density plots are much more attractive and bar charts are more common) but I think they do nicely for comparing the distributions of a categorical/num variable combo.
# Frequency polygons of price, one semi-transparent line per cut.
ggplot(data = diamonds, mapping = aes(x = price, color = cut)) +
  geom_freqpoly(alpha = .6, size = 1) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new
This plot works better with 2-4 categories. When there are more categories than that, I would stagger or stack them, as seen below. You may decide whether to remove or keep the y axis labels.
# Overlaid density curves of price, one semi-transparent curve per cut.
# The y-axis text is hidden and the legend is stacked vertically on the right.
ggplot(
  data = diamonds,
  mapping = aes(x = price, fill = cut, color = cut)
) +
  geom_density(alpha = .3) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "X Variable"
  ) +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    axis.title = element_text(size = 10, color = "grey40"),
    axis.text.y = element_blank()
  )
### Density plot: facetted
# Density curves of price, with each cut drawn in its own facet instead of
# overlaid.
ggplot(
  data = diamonds,
  mapping = aes(x = price, fill = cut, color = cut)
) +
  geom_density(alpha = .3) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "X Variable"
  ) +
  facet_wrap(~cut) +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    axis.title = element_text(size = 10, color = "grey40"),
    axis.text.y = element_blank()
  )
# Ridgeline plot: one price density ridge per clarity level, coloured by
# clarity. The legend is dropped because the y axis already names each ridge.
ggplot(
  data = diamonds,
  mapping = aes(x = price, y = clarity, fill = clarity, color = clarity)
) +
  geom_density_ridges(alpha = .3) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "X Variable",
    y = "Y Variable"
  ) +
  theme_new +
  theme(
    legend.position = "none",
    axis.title.x = element_text(size = 10, color = "grey40")
  )
# Ridgeline plot of price per clarity where the fill follows the x value
# (price) along each ridge, using a custom three-colour gradient.
# after_stat(x) replaces the deprecated `..x..` notation for referring to a
# variable computed by the stat.
diamonds %>%
  ggplot(aes(x = price, y = clarity)) +
  geom_density_ridges_gradient(aes(fill = after_stat(x))) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "X Variable",
    y = "Y Variable"
  ) +
  scale_fill_gradientn(
    colours = c("#0D0887FF", "#ff9900", "#ffffff")
  ) +
  theme_new +
  theme(
    legend.position = "none",
    axis.title.x = element_text(size = 10, color = "grey40")
  )
One to many. Here is the real purpose of boxplots. I am not a huge fan of this color scheme with this particular graph, though; I would pick something else. Notice that in this case I have reordered the categories by the median of price, which is a big no-no if you have an ordinal variable, but I wanted the code here to show how you would do that.
# Boxplots of price per cut, with the cuts ordered along x by their median
# price (missing values ignored when computing the median).
ggplot(
  data = diamonds,
  mapping = aes(
    x = fct_reorder(cut, price, median, na.rm = TRUE),
    y = price,
    fill = cut
  )
) +
  geom_boxplot(outlier.alpha = .1) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )
One to many. This looks like a Rorschach inkblot, but it is another way to show distribution.
# Centered dot plot of price for three cuts. Dots are binned along the y axis.
diamonds %>%
  # Restricting to three cuts is not necessary for the graph.
  filter(cut %in% c("Fair", "Good", "Very Good")) %>%
  ggplot(mapping = aes(x = cut, y = price, color = cut)) +
  # You'll have to change the binwidth for your particular dataset.
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 50) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )
# Boxplot with a faint centered dot plot layered on top, for three cuts ordered
# by median price. Boxplot outliers are made fully transparent because the dots
# already show every observation.
diamonds %>%
  # Restricting to three cuts is not necessary for the graph.
  filter(cut %in% c("Fair", "Good", "Very Good")) %>%
  ggplot(
    mapping = aes(
      x = fct_reorder(cut, price, median, na.rm = TRUE),
      y = price,
      fill = cut
    )
  ) +
  geom_boxplot(outlier.alpha = 0) +
  # You'll have to change the binwidth for your particular dataset.
  geom_dotplot(
    binaxis = "y",
    stackdir = "center",
    binwidth = 50,
    alpha = .1
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )
This plot is reordered by the frequency of each factor.
# Violin plot of price per cut, with cuts ordered from least to most frequent.
# A narrow translucent boxplot and a red diamond marking the mean are layered
# on top of each violin.
ggplot(
  data = diamonds,
  mapping = aes(x = fct_rev(fct_infreq(cut)), y = price)
) +
  geom_violin(fill = "#afd7db", alpha = .6, size = 0) +
  geom_boxplot(
    width = 0.1,
    fill = "#ffffff",
    alpha = 0.5,
    size = .2,
    outlier.alpha = .01
  ) +
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 23,
    size = 1,
    color = "red",
    fill = "red",
    stroke = 0.75,
    alpha = .6
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )
This plot works better with smaller datasets (which is why I took a smaller sample of the data)
# Violin plot of price per cut (least to most frequent cut) with the individual
# observations jittered on top. Drawn on a random sample of 1000 rows, so the
# output differs between runs unless a seed is set.
diamonds %>%
  # Sampling is not needed for most graphs.
  sample_n(1000) %>%
  ggplot(mapping = aes(x = fct_rev(fct_infreq(cut)), y = price)) +
  geom_violin(fill = "#afd7db", alpha = .6, size = 0) +
  geom_point(position = "jitter", size = 1, alpha = 0.1) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )
One to one. A col chart looks almost exactly like a bar graph, but it plots two variables instead of one, and you can choose the stat instead of the default “count”. This requires a one-to-one relationship between variables, like in the numeric col chart version. But unlike the numeric version, we can get around this by summarizing one of the categories by a stat. Here we are using the mean, but we could use the median, standard deviation, min, max, or just about any other stat we can think of. Unless the categorical variable is ordinal (like this one), I recommend you reorder it by the stat you are using. (The commented-out line shows you how to do that.)
# Column chart of the mean price per clarity.
# Data prep (not part of the graph in every case): collapse to one row per
# clarity so the relationship is one to one.
diamonds %>%
  group_by(clarity) %>%
  summarise(mean_price = mean(price)) %>%
  arrange(mean_price) %>%
  # Graph (needed in every case).
  ggplot(mapping = aes(x = clarity, y = mean_price, fill = clarity)) +
  # Alternative that orders the bars by the stat:
  # ggplot(aes(x = fct_reorder(clarity, mean_price), y = mean_price, fill = clarity)) +
  geom_col() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))
A col chart looks almost exactly like a bar graph, but it plots two variables instead of one, and you can choose the stat instead of the default “count”. This requires a one-to-one relationship between variables, like in the numeric col chart version. But unlike the numeric version, we can get around this by summarizing one of the categories by a stat. Here we are using the mean, but we could use the median, standard deviation, min, max, or just about any other stat we can think of. Unless the categorical variable is ordinal (like this one), I recommend you reorder it by the stat you are using. (The hashtagged line shows you how to do that.)
# Column chart of the mean price per clarity, with the rounded dollar value
# printed in white just inside the top of each bar.
# Data prep (not part of the graph in every case).
diamonds %>%
  group_by(clarity) %>%
  summarise(mean_price = mean(price)) %>%
  arrange(mean_price) %>%
  # Graph (needed in every case).
  ggplot(mapping = aes(x = clarity, y = mean_price, fill = clarity)) +
  geom_col() +
  geom_text(
    mapping = aes(label = paste0("$", round(mean_price))),
    vjust = 1.5,
    colour = "white"
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    legend.position = "none"
  )
# Column chart of the mean depth per cut, with error bars spanning one
# standard deviation either side of the mean.
# Data prep (not part of the graph in every case).
diamonds %>%
  group_by(cut) %>%
  summarise(
    mean_depth = mean(depth),
    sd_depth = sd(depth)
  ) %>%
  arrange(mean_depth) %>%
  # Graph (needed in every case).
  ggplot(mapping = aes(x = cut, y = mean_depth, fill = cut)) +
  geom_col() +
  geom_errorbar(
    mapping = aes(
      ymin = mean_depth - sd_depth,
      ymax = mean_depth + sd_depth
    ),
    width = 0.1
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))
(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)
# Jittered scatter of price per clarity. The y axis is limited to 0-20000.
ggplot(data = diamonds, mapping = aes(x = clarity, y = price)) +
  geom_point(position = "jitter", color = "#183054", alpha = .1) +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))
One to one relationship, meaning each category can only have one associated numerical value. (You can also achieve this by running a stat like in the cat/num column chart example). This is good for a cat variable with lots of categories.
# Lollipop chart: 2007 GDP per capita for every country in the Americas,
# ordered by GDP per capita.
gapminder %>%
  # `year` is numeric, so compare it with a number rather than a string.
  filter(continent == "Americas", year == 2007) %>%
  # Reorder the country factor once so every layer shares the same ordering,
  # instead of repeating fct_reorder() inside each aesthetic.
  mutate(country = fct_reorder(country, gdpPercap)) %>%
  ggplot(aes(x = country, y = gdpPercap)) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  # Stick: from zero up to the country's value.
  geom_segment(
    aes(xend = country, y = 0, yend = gdpPercap),
    color = "grey"
  ) +
  # Lollipop head.
  geom_point(size = 3, color = "#afd7db") +
  coord_flip() +
  theme_new +
  theme(panel.grid.major.y = element_blank())
This is technically a Cat/Cat/Num plot, but I am putting it here with Lolli.
# Dumbbell chart: GDP per capita in 1952 vs. 2007 for every country in the
# Americas, ordered by the 2007 value.
gapminder %>%
  filter(continent == "Americas") %>%
  # One row per country, with one column per measure and year
  # (e.g. gdpPercap_1952, gdpPercap_2007).
  pivot_wider(names_from = year, values_from = c(lifeExp, pop, gdpPercap)) %>%
  # Reorder the factor once so the segment and both point layers agree on the
  # axis order, rather than relying on the first layer to fix it.
  mutate(country = fct_reorder(country, gdpPercap_2007)) %>%
  ggplot() +
  geom_segment(
    aes(x = country, xend = country, y = gdpPercap_1952, yend = gdpPercap_2007),
    color = "grey"
  ) +
  geom_point(aes(x = country, y = gdpPercap_1952), size = 3, color = "#afd7db") +
  geom_point(aes(x = country, y = gdpPercap_2007), size = 3, color = "#f0b684") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  coord_flip() +
  theme_new +
  theme(panel.grid.major.y = element_blank())
# Faceted histogram of price per cut. Each facet shows the full data set in
# grey behind the highlighted cut: dropping the `cut` column from the
# background layer's data makes that layer repeat in every facet.
diamonds %>%
  ggplot(aes(price, fill = cut)) +
  # Both layers set the same explicit bin count so background and foreground
  # stay aligned (30 is also the default the background layer used before).
  geom_histogram(
    data = select(diamonds, -cut),
    bins = 30,
    fill = "grey",
    show.legend = FALSE
  ) +
  geom_histogram(bins = 30) +
  scale_fill_viridis_d() +
  facet_wrap(~ cut) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new
# Faceted density of price per cut, scaled to counts, with the full data set
# in grey behind each highlighted cut (the background layer's data has no
# `cut` column, so it repeats in every facet).
# after_stat(count) replaces the deprecated `..count..` notation.
diamonds %>%
  ggplot(aes(x = price, y = after_stat(count))) +
  geom_density(
    data = select(diamonds, -cut),
    fill = "grey",
    color = "grey",
    show.legend = FALSE
  ) +
  geom_density(aes(fill = cut, color = cut), show.legend = FALSE) +
  scale_fill_viridis_d(alpha = .5) +
  facet_wrap(~ cut) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new
# Scatter plot of price against carat with a smoothed trend line (default
# smoothing method) and its confidence band.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(color = "#183054", alpha = .1) +
  stat_smooth(color = "#ffa781", size = .7, alpha = .2) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))
(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)
# Scatter plot of price against carat with a linear-model fit line.
# The y axis is limited to 0-20000.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(color = "#183054", alpha = .1) +
  geom_smooth(method = lm, color = "#ffa781", size = .7) +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))
(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)
# Scatter plot of price against carat with 2D density contour lines on top.
# The y axis is limited to 0-20000.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(color = "#183054", alpha = .1) +
  geom_density2d(color = "#ffa781", alpha = .5) +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))
This is good for datasets with a lot of overlapping data. It is an alternative to jitter.
# Count plot of petal width against petal length: overlapping observations are
# merged into one point whose size reflects how many there are.
ggplot(data = iris, mapping = aes(x = Petal.Length, y = Petal.Width)) +
  geom_count(color = "#183054", alpha = .8) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))
Also an option for numeric data with a lot of overlapping data.
# Scatter plot of petal width against petal length where colour (not size)
# encodes how many observations share each exact (length, width) pair.
iris %>%
  # count() tallies each pair and returns an ungrouped result, replacing
  # group_by() + summarise(), which left the data grouped.
  count(Petal.Length, Petal.Width, name = "count") %>%
  ggplot(aes(x = Petal.Length, y = Petal.Width, color = count)) +
  # The data are already one row per pair and the point size is fixed, so a
  # plain point layer is what is actually being drawn; geom_count() would
  # re-count rows that are all unique.
  geom_point(size = 5, alpha = .8) +
  scale_color_viridis() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))
Another option for over-plotting.
# Boxplots of life expectancy, one box per survey year. `year` is numeric, so
# the boxes must be grouped explicitly by year.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp)) +
  # The stray trailing comma (an empty argument) after `width = 1` was removed.
  geom_boxplot(aes(group = year), width = 1) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    y = "Y Variable",
    x = ""
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40")) +
  # To increase frequency of axis ticks
  scale_x_continuous(breaks = seq(1952, 2007, 5))
(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)
# Hexbin chart of price against depth: 70 hexagonal bins, filled with the
# viridis colour scale. The y axis is limited to 0-15000.
ggplot(data = diamonds, mapping = aes(x = depth, y = price)) +
  geom_hex(bins = 70) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  scale_fill_continuous(type = "viridis") +
  ylim(0, 15000) +
  theme_new
# Filled 2D density plot of price against depth: contour polygons filled by
# density level.
diamonds %>%
  ggplot(aes(x = depth, y = price)) +
  # after_stat(level) replaces the deprecated `..level..` notation.
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon") +
  # Only one fill scale is kept. The original added scale_fill_viridis() and
  # then this one, which replaced it (with a "scale already present" message).
  scale_fill_continuous(type = "viridis") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new
# 2D density contour lines of price against depth.
# The two fill scales that used to be added here were removed: no layer maps a
# fill aesthetic, so they had no effect on the plot and only produced a
# "scale for fill is already present" message.
diamonds %>%
  ggplot(aes(x = depth, y = price)) +
  geom_density2d() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new
# Rectangular 2D bin plot of price against depth; each bin is 1 unit of depth
# wide and 1000 units of price tall.
diamonds %>%
  ggplot(aes(x = depth, y = price)) +
  geom_bin2d(binwidth = c(1, 1000)) +
  # Only one fill scale is kept. The original added scale_fill_viridis() and
  # then this one, which replaced it (with a "scale already present" message).
  scale_fill_continuous(type = "viridis") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new
You need a one to one relationship in your dataset to map this. For example, each numeric x only has one numeric y in your dataset.
# Column chart of life expectancy per survey year for a single country.
# Data prep (not part of the graph in every case): keep one country so each
# year has exactly one value.
gapminder %>%
  filter(country == "Bangladesh") %>%
  # Graph (needed in every case).
  ggplot(mapping = aes(x = year, y = lifeExp)) +
  geom_col(fill = "#afd7db") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40")) +
  # To increase frequency of axis ticks
  scale_x_continuous(breaks = seq(1952, 2007, 5))
You need a one to one relationship in your dataset to map this, as well as positive and negative data. For example, each numeric x only has one numeric y in your dataset.
# Diverging column chart over time: unemployment expressed as its deviation
# from the series mean, coloured by whether the deviation is non-negative.
# Data prep (not necessary for the graph).
economics_long %>%
  filter(variable == "unemploy") %>%
  mutate(
    value2 = value - mean(value),
    pos = value2 >= 0
  ) %>%
  # Graph.
  ggplot(mapping = aes(x = date, y = value2, fill = pos)) +
  geom_col(position = "identity") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(legend.position = "none")
Before you jump into visualizations with more than three variables, you should ask yourself: “Do I really need to put these together in the same visualization?” These graphs can get complex and overwhelming fast. Ask yourself whether your audience will really take the time to look at these graphs that you have created.
This accepts any type of variable. Rescale it to see it better. This is just one of many types of graphs GGally can produce. See the documentation below for more graphs and examples.
https://ggobi.github.io/ggally/reference/ggpairs.html
# Pairs plot (GGally) of the first five columns of diamonds.
ggpairs(data = diamonds, columns = 1:5) +
  theme_new
This accepts any type of variable. Rescale it to see it better. This is just one of many types of graphs GGally can produce. See the documentation below for more graphs and examples.
https://ggobi.github.io/ggally/reference/ggpairs.html
# Pairs plot (GGally) of the first five columns of diamonds, with fill mapped
# to cut.
ggpairs(data = diamonds, mapping = aes(fill = cut), columns = 1:5) +
  theme_new
# Parallel coordinates plot (GGally) on a random sample of 50 diamonds, using
# columns 1 and 5-7 as axes and column 2 as the colour grouping. The sample is
# random, so the output differs between runs unless a seed is set.
diamonds %>%
  sample_n(50) %>%
  ggparcoord(
    columns = c(1, 5:7),
    groupColumn = 2,
    showPoints = TRUE,
    alphaLines = 0.3,
    title = "Title"
  ) +
  scale_color_viridis(discrete = TRUE) +
  theme_new +
  theme(legend.key = element_rect(fill = "#ffffff"))
Note: you may have to adjust the height of this plot when you save it or change the theme to something taller.
# Grid of bar charts: color counts for every clarity (rows) by cut (columns)
# combination, with a compact vertical legend on the right.
ggplot(data = diamonds, mapping = aes(x = color, fill = color)) +
  geom_bar() +
  facet_grid(clarity ~ cut) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    legend.key.size = unit(.3, "cm")
  )
Again, this is not the best theme for the aspect ratio of this graph but it’s something you should change in ggsave.
# Stacked bar charts of color counts split by cut, one facet per clarity, with
# a compact vertical legend on the right.
ggplot(data = diamonds, mapping = aes(x = color, fill = cut)) +
  geom_bar() +
  facet_wrap(~clarity) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    legend.key.size = unit(.3, "cm")
  )
# Heatmap of the total price for every cut/color combination.
diamonds %>%
  group_by(cut, color) %>%
  summarize(price = sum(price)) %>%
  ggplot(mapping = aes(x = cut, y = color, fill = price)) +
  geom_tile() +
  scale_fill_viridis(option = "B") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = ""
  ) +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    panel.grid.major.y = element_blank(),
    axis.title = element_text()
  )
# Heatmap of the total price for every cut/color combination, with the total
# printed in white on each tile.
diamonds %>%
  group_by(cut, color) %>%
  summarize(price = sum(price)) %>%
  ggplot(mapping = aes(x = cut, y = color, fill = price)) +
  geom_tile() +
  geom_text(mapping = aes(label = price), color = "white") +
  scale_fill_viridis(option = "B") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = ""
  ) +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    panel.grid.major.y = element_blank(),
    axis.title = element_text()
  )
(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)
# Scatter plot of price against carat with one subset highlighted: all
# diamonds are drawn in dark blue, then the clarity "I1" diamonds are drawn
# again on top in orange. The y axis is limited to 0-20000.
i1 <- filter(diamonds, clarity == "I1")

ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(color = "#183054", alpha = .1) +
  geom_point(data = i1, color = "#ffa781", alpha = .2) +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))
(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)
# Scatter plot of price against carat, coloured by clarity, with a vertical
# legend on the right. The y axis is limited to 0-20000.
ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, color = clarity)
) +
  geom_point(alpha = .1) +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    legend.position = "right",
    legend.direction = "vertical"
  )
# Scatter plot of depth against table, coloured by cut and faceted into one
# column per clarity. The view is zoomed with coord_cartesian(), and both axes
# get a tick every 5 units.
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = table, y = depth, col = cut)) +
  labs(
    title = "Comparing table and depth of diamonds",
    x = "Table",
    y = "Depth"
  ) +
  theme_new +
  coord_cartesian(xlim = c(45, 75), ylim = c(50, 75)) +
  scale_x_continuous(breaks = seq(45, 75, 5)) +
  scale_y_continuous(breaks = seq(50, 75, 5)) +
  scale_color_brewer(name = "Diamond Cut Quality", palette = "Set1") +
  facet_grid(. ~ clarity) +
  theme(legend.key = element_rect(fill = "#ffffff"))
Custom colors have been included in this graph. These may be adjusted by changing the hexcode. Numerical data needs a one to one relationship.
# Line chart of life expectancy over time for five countries, one custom
# colour per country.
# Data prep: keep only the countries to compare.
gapminder %>%
  filter(
    country %in% c("China", "United States", "Haiti", "Afghanistan", "Myanmar")
  ) %>%
  # Graph.
  ggplot(mapping = aes(x = year, y = lifeExp, color = country)) +
  geom_line(size = 1) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    caption = "Source: "
  ) +
  theme_new +
  scale_color_manual(
    values = c("#f5c242", "#731130", "#139e90", "#296e91", "#ab5107")
  ) +
  theme(
    legend.key = element_rect(fill = "#ffffff"),
    legend.title = element_blank()
  )
Custom colors have been included in this graph. These may be adjusted by changing the hexcode. Numerical data needs a one to one relationship.
# Line chart with point markers: life expectancy from 1990 onwards for five
# countries, one custom colour per country.
# Data prep: keep only recent years and the countries to compare.
gapminder %>%
  filter(
    year >= 1990,
    country %in% c("China", "United States", "Haiti", "Afghanistan", "Myanmar")
  ) %>%
  # Graph.
  ggplot(mapping = aes(x = year, y = lifeExp, color = country)) +
  geom_line(size = .5) +
  geom_point() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    caption = "Source: "
  ) +
  theme_new +
  scale_color_manual(
    values = c("#f5c242", "#731130", "#139e90", "#296e91", "#ab5107")
  ) +
  theme(
    legend.key = element_rect(fill = "#ffffff"),
    legend.title = element_blank()
  )
One to many relationship
# Stacked area chart of the mean GDP per capita per continent over time.
# Data prep (not part of the graph in every case): one value per year and
# continent.
gapminder %>%
  group_by(year, continent) %>%
  summarize(mean_gdp = mean(gdpPercap)) %>%
  # Graph (needed in every case).
  ggplot(mapping = aes(x = year, y = mean_gdp, fill = continent)) +
  geom_area() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40")) +
  # This changes the labels to show every 5 years starting at 1952 to match
  # the data.
  scale_x_continuous(breaks = seq(1952, 2007, 5))
One to many relationship with an ordinal category (continent is non-ordered but let’s just pretend it is!)
# Stacked area chart of the mean GDP per capita per continent over time, with
# thin black outlines, translucent fills and a sequential "Blues" palette.
# Data prep (not part of the graph in every case).
gapminder %>%
  group_by(year, continent) %>%
  summarize(mean_gdp = mean(gdpPercap)) %>%
  # Graph (needed in every case).
  ggplot(mapping = aes(x = year, y = mean_gdp, fill = continent)) +
  geom_area(colour = "black", size = .2, alpha = .4) +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40")) +
  # This changes the labels to show every 5 years starting at 1952 to match
  # the data.
  scale_x_continuous(breaks = seq(1952, 2007, 5))
One to many relationship with an ordinal category (continent is non-ordered but let’s just pretend it is!)
# Proportional (100%) stacked area chart: each continent's share of the summed
# mean GDP per capita over time, with a sequential "Blues" palette.
# Data prep (not part of the graph in every case).
gapminder %>%
  group_by(year, continent) %>%
  summarize(mean_gdp = mean(gdpPercap)) %>%
  # Graph (needed in every case).
  ggplot(mapping = aes(x = year, y = mean_gdp, fill = continent)) +
  geom_area(
    position = "fill",
    colour = "black",
    size = .2,
    alpha = .4
  ) +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40")) +
  # This changes the labels to show every 5 years starting at 1952 to match
  # the data.
  scale_x_continuous(breaks = seq(1952, 2007, 5))
I have had a hard time getting this type of graph to look good with other data but I figured I would include it just in case.
# Tile plot of the faithfuld data: eruption length against waiting time,
# filled by density, with a vertical legend on the right.
ggplot(data = faithfuld) +
  geom_tile(mapping = aes(x = waiting, y = eruptions, fill = density)) +
  scale_fill_viridis_c() +
  labs(
    title = "Title",
    caption = "Source: "
  ) +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical"
  )
# Static stacked bar chart that is converted to an interactive plot below.
fig <- ggplot(data = diamonds, mapping = aes(x = cut, fill = color)) +
  geom_bar() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable"
  ) +
  theme_new

# Font used inside the hover labels.
font <- list(
  # family = "Open Sans",
  size = 15,
  color = "black"
)

# Hover label styling: white background, no visible border.
label <- list(
  bgcolor = "#FFFFFF",
  bordercolor = "transparent",
  font = font
)

# Convert the ggplot to plotly and apply the hover label styling.
ggplotly(fig) %>%
  style(hoverlabel = label)
# To remove the configuration bar, also pipe into:
# config(displayModeBar = FALSE)
# Interactive 3D scatter plot (plotly): carat, depth and price on the three
# axes, coloured by clarity.
plot_ly(
  data = diamonds,
  x = ~carat,
  y = ~depth,
  z = ~price,
  color = ~clarity
) %>%
  add_markers(opacity = .6) %>%
  layout(
    title = "\ntitle",
    scene = list(
      xaxis = list(title = 'carat'),
      yaxis = list(title = 'depth'),
      zaxis = list(title = 'price')
    )
  )
See the geospatial RMD
I am going to comment this out so it doesn’t run, but here’s the code.
# dataset <- read.csv("dataset.csv")
# dataset$date <- as.Date(dataset$date)
# dataset_ts <- dataset %>%
# mutate(date = yearmonth(date)) %>% # You don't always need this line, but it often helps
# as_tsibble(key = c(variable),
# index = date)
# You only need to authenticate if you are getting an error that you have
# exceeded the Quandl anonymous user limit. Never paste an API key into the
# source: keep it in the QUANDL_API_KEY environment variable (for example in
# ~/.Renviron) and read it at run time. The key that was previously written
# here should be treated as exposed and rotated.
# Quandl.api_key(Sys.getenv("QUANDL_API_KEY"))

# Downloading the data: series FRED/RSAFSNA from 1990-01-01 onwards, returned
# as a `ts` object.
retail_raw <- Quandl(
  "FRED/RSAFSNA",
  type = "ts",
  start_date = "1990-01-01"
)

# Making it into tsibble format, with descriptive column names.
retail <- retail_raw %>%
  as_tsibble() %>%
  rename(
    date = index,
    retail_revenue = value
  )
# With autoplot (faster but with less control)
autoplot(retail)

# By hand (slower and with more control): the raw series as a line plus a
# smoothed trend (span = .2).
ggplot(data = retail, mapping = aes(x = date, y = retail_revenue)) +
  geom_line(color = "#3c5d87") +
  stat_smooth(color = "#ffa781", size = .7, alpha = .2, span = .2) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Retail Revenue",
    caption = "Source: U.S. Census Bureau "
  ) +
  theme_new
# STL decomposition of retail revenue: fit the model, extract its components
# and plot them.
retail %>%
  model(STL(retail_revenue)) %>%
  components() %>%
  autoplot(color = "#3c5d87") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Retail Revenue",
    caption = "Source: U.S. Census Bureau "
  ) +
  theme_new
# ETS forecast of retail revenue 24 periods ahead, plotted together with the
# historical data.
retail %>%
  # `retail_revenue` is the variable being modelled.
  model(ETS(retail_revenue)) %>%
  forecast(h = 24) %>%
  # `retail` is the historical data set drawn alongside the forecast.
  autoplot(retail, color = "#3c5d87") +
  labs(
    title = "Retail forecast for two years",
    subtitle = "",
    x = "",
    y = "Retail Revenue",
    caption = "Source: U.S. Census Bureau "
  ) +
  theme_new
# Rate of change of retail revenue. This starts from the data in the format it
# was originally pulled in (a `ts` object), applies tsbox's ts_pca(), then
# converts to a tsibble for plotting.
retail_raw %>%
  ts_pca() %>%
  as_tsibble() %>%
  rename(date = index, retail_revenue = value) %>%
  ggplot(mapping = aes(x = date, y = retail_revenue)) +
  geom_line(color = "#3c5d87") +
  stat_smooth(color = "#ffa781", size = .7, alpha = .2, span = .2) +
  # Reference line at zero change.
  geom_hline(yintercept = 0, color = "white") +
  labs(
    title = "Retail revenue rate of change",
    subtitle = "",
    x = "",
    y = "Change in Retail Revenue over time",
    caption = "Source: U.S. Census Bureau"
  ) +
  theme_new

# Interactive version: called without arguments, so it presumably converts the
# most recently drawn ggplot (the one above).
ggplotly()
# Interactive time series with dygraphs, including a range selector.
dygraph(retail_raw, main = "US Retail Sales") %>%
  dyRangeSelector()

# Interactive time series with highcharter.
hchart(retail_raw) %>%
  hc_title(text = "US Retail Sales")
First we need to take our text file and sort out all the individual words and their frequency.
# We will be using the "sentences" dataset that is loaded with tidyverse.
# Typically instead of these lines you would just read in your dataset.
# An artificial `speaker` category column is added (the four names are cycled
# across the sentences) so we can do some comparisons. Don't worry about the
# code up to this point since it's just an example.
speaker <- rep(
  c("Sam", "Jane", "Julie", "Whitaker"),
  length.out = length(sentences)
)

# Build the two-column data frame directly. stringsAsFactors = FALSE keeps
# `text` as character on every R version (it defaulted to TRUE before R 4.0).
test_text <- data.frame(
  text = sentences,
  speaker = speaker,
  stringsAsFactors = FALSE
)
# Make a custom stopwords list that you can change over time. NA is included
# so that missing tokens are filtered out along with the custom words.
custom_stop <- c("get", "can", NA)

# Split into single words: one row per word, minus stop words.
single_split <- test_text %>%
  # `word` is the new column name (don't change), `text` is the existing text
  # column name.
  unnest_tokens(word, text) %>%
  # The join key is spelled out so the join cannot silently pick up any other
  # column the two tables might share, and no "Joining with" message is shown.
  anti_join(get_stopwords(), by = "word") %>%
  filter(!word %in% custom_stop)
# Count how often each word occurs within each speaker. If there is no
# category, just group by word. NOTE(review): later code calls ungroup() on
# this result, so it appears to stay grouped (presumably by speaker) after
# summarize().
single_count <- summarize(
  group_by(single_split, speaker, word),
  freq = n()
)
# Take a look at single_count to adjust custom stop words if needed
Remember these wordclouds are most effective when you filter the data into groups and compare them.
# Compare groups by filtering for a category: keep one speaker's words and
# reduce the table to the word and freq columns.
df <- single_count %>%
  filter(speaker == "Sam") %>%
  ungroup() %>%
  select(word, freq)

# Put custom colors here.
colors <- c("#BE4422", "#497BB8", "#FBB861", "#8B99B6")

# Create the wordcloud. Adjust as needed. You may want to open and save it in
# a browser. The four colours are recycled to give one colour per word.
wordcloud2(
  df,
  color = rep_len(colors, nrow(df)),
  # fontFamily = "Open Sans",
  background = "#ffffff",
  rotateRatio = 0
)
# Grabs the most frequent words from each speaker and plots them in one facet
# per speaker. Change n to change how many are shown.
single_count %>%
  # Group explicitly so "top n per speaker" does not depend on whatever
  # grouping single_count happens to carry.
  group_by(speaker) %>%
  slice_max(order_by = freq, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  # reorder_within() orders the words by frequency inside each speaker; a
  # plain fct_reorder() can only produce one global order, which is wrong for
  # words that appear under several speakers.
  mutate(word_in_speaker = reorder_within(word, freq, speaker)) %>%
  ggplot(aes(y = word_in_speaker, x = freq, fill = word)) +
  geom_col() +
  # Strips the speaker suffix that reorder_within() adds to the axis labels.
  scale_y_reordered() +
  facet_wrap(~speaker, scales = "free_y") +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(legend.position = "none")
Bigrams are two words together. You can do bigrams split among a categorical variable like we did for single words, or you can lump them together as we are doing here.
# Split the text into bigrams (pairs of consecutive words) and drop every
# bigram in which either word is a stop word or a custom stop word.
bigram_split <- test_text %>%
  # `bigram` is the new column name, `text` is the existing text column name.
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Split into two columns so each word can be checked on its own.
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word1 %in% custom_stop,
    !word2 %in% custom_stop
  ) %>%
  # Glue the two words back together.
  unite(bigram, word1, word2, sep = " ")

# Frequency of every remaining bigram.
bigram_count <- summarize(
  group_by(bigram_split, bigram),
  freq = n()
)
Remember these wordclouds are most effective when you filter the data into groups and compare them. This is the most boring cloud ever because there are so few repeated bigrams in this dataset, but in a more robust text dataset this can be particularly interesting.
# The bigram counts are used as-is here (to compare groups, filter for a
# category first).
df <- bigram_count

# Put custom colors here.
colors <- c("#BE4422", "#497BB8", "#FBB861", "#8B99B6")

# Create the wordcloud. Adjust as needed. You may want to open and save it in
# a browser. The four colours are recycled to give one colour per bigram.
wordcloud2(
  df,
  color = rep_len(colors, nrow(df)),
  # fontFamily = "Open Sans",
  background = "#ffffff",
  rotateRatio = 0
)
# Horizontal bar chart of the ten most frequent bigrams, ordered by frequency.
bigram_count %>%
  # Here we are just grabbing the top bigrams.
  arrange(desc(freq)) %>%
  head(n = 10) %>%
  ggplot(
    mapping = aes(x = freq, y = fct_reorder(bigram, freq), fill = bigram)
  ) +
  geom_col() +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(legend.position = "none")
The sentiment analysis can be done using several different sentiment datasets. Each uses the single_split dataset we created before.
Splits words into emotions
# Tag every word with its NRC emotion categories. The join key is spelled out
# so the join cannot silently use any other shared column and no "Joining
# with" message is shown.
emotion_sentiment <- single_split %>%
  inner_join(get_sentiments("nrc"), by = "word")

# Count the tagged words per speaker and emotion for three of the speakers.
emotion_sentiment_grouped <- emotion_sentiment %>%
  filter(speaker %in% c("Sam", "Julie", "Jane")) %>%
  group_by(speaker, sentiment) %>%
  summarize(count = n())

# Horizontal bars of emotion counts, one facet per speaker.
emotion_sentiment_grouped %>%
  ggplot(aes(y = fct_reorder(sentiment, count), x = count, fill = sentiment)) +
  geom_col() +
  facet_wrap(~speaker) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(legend.position = "none")
Splits words into weighted categories. More positive or more negative means the word has a more positive or negative connotation.
# Attach the AFINN score (the `value` column) to every word that has one. The
# join key is spelled out so the join cannot silently use any other shared
# column and no "Joining with" message is shown.
sentiment_weight <- single_split %>%
  inner_join(get_sentiments("afinn"), by = "word")

# Histogram of the scores per speaker, coloured by whether the score is
# positive.
sentiment_weight %>%
  ggplot(aes(x = value, fill = value > 0)) +
  geom_histogram(bins = 8) +
  facet_wrap(~speaker) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(legend.position = "none")
Splits words into just positive/negative
# Tag every word with its Bing sentiment and count the tagged words per
# speaker and sentiment. The join key is spelled out so the join cannot
# silently use any other shared column and no "Joining with" message is shown.
sentiment_binary <- single_split %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(speaker, sentiment) %>%
  summarize(count = n())

# Note: if categories have large disparities in how many words each has
# entirely, a better plot may be "position = 'fill'" so you can see the
# proportion and more easily compare them.
sentiment_binary %>%
  ggplot(aes(x = speaker, y = count, fill = sentiment)) +
  geom_col(position = "dodge") +
  labs(title = "Title", subtitle = "Subtitle") +
  theme_new
Splits text into sentences, then computes the average sentiment for each. Neutral sentences included.
This code does not take into consideration categories. If you would like to compare different speakers for example, you need to make a subset of your dataframe and then run this code separately for each speaker subset.
# Split the text into sentences (the stray empty `[]` subset was dropped; it
# returned the same data frame).
sentence_breakdown <- get_sentences(test_text)

# Score every sentence, keep the numeric score as `sentiment_num`, and replace
# `sentiment` with a three-way label. You shouldn't have to adjust these lines
# at all.
# NOTE(review): this object shares its name with the sentiment() function it
# is built from; the name is kept in case other code refers to it.
sentiment <- sentiment(sentence_breakdown) %>%
  mutate(
    sentiment_num = sentiment,
    sentiment = case_when(
      sentiment_num == 0 ~ "neutral",
      sentiment_num > 0 ~ "positive",
      TRUE ~ "negative"
    )
  )

# Colours are named by label so each category keeps its colour even when one
# of the three categories is absent from the data (unnamed values are assigned
# by position and would shift).
colors <- c(
  negative = "#f4927c",
  neutral = "#b3b4b5",
  positive = "#6397ce"
)

# Histogram of sentence scores, filled by label. bins = 30 is the default
# made explicit, which also silences the "pick a better value" message.
sentiment %>%
  ggplot(aes(x = sentiment_num, fill = sentiment)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  scale_fill_manual(values = colors)